K-Folds Process
Setup
Values will differ from last submission due to new random sampling of the data for train/test sets.
library(tidyverse)
library(dplyr)
library(caret)
library(class)
library(yardstick)
library(plotly)
library(boot)
library(pROC)
library(glmnet)
library(purrr)
library(gridExtra)
library(randomForest)
library(e1071)
install.packages("kernlab")
#reading in the data
data <- read.csv("HaitiPixels.csv", header=TRUE ,sep=",")
data <- data %>%
mutate(BlueClass = as.factor(ifelse(Class=="Blue Tarp","Yes", "No")))
#check the levels just specified
levels(data$BlueClass)
## [1] "No" "Yes"
#set data var to be columns 2-5 of the set
data = data[c(2:5)]
data <- data %>% mutate(id = row_number())
#check addition
head(data$id)
## [1] 1 2 3 4 5 6
#shuffle data to fairly split into test / train
shuffleddata = sample_n(data, nrow(data))
#check that it has been shuffled
head(shuffleddata$id) #different then the first six lines of the csv file
## [1] 33926 61364 5446 61528 61607 30568
#remove the id column
shuffleddata = shuffleddata[c(1:4)]
#split the data into test and train for use in our upcoming models
#using a 10k subset for faster knn function execution and file freezing issues
samp <- 1:10000
samp2 <- 10001:20000
train<-shuffleddata[samp,]
test <- shuffleddata[samp2,]
head(train)
## Red Green Blue BlueClass
## 1 255 233 166 No
## 2 168 197 239 Yes
## 3 72 71 51 No
## 4 219 255 255 Yes
## 5 222 254 255 Yes
## 6 255 240 185 No
head(test)
## Red Green Blue BlueClass
## 10001 255 255 211 No
## 10002 194 177 142 No
## 10003 105 103 75 No
## 10004 251 210 161 No
## 10005 255 255 203 No
## 10006 255 229 176 No
KNN
#model training rules for all models
train_control <- caret::trainControl(method="cv", number=10, returnResamp='all', classProbs=TRUE, savePredictions='final')
#KNN model
system.time({
knnmod=train(BlueClass~Red+Green+Blue,data=train,trControl=train_control,method="knn",preProcess = c("center","scale"), tuneGrid = expand.grid(k = c(1:15)))
})
## user system elapsed
## 16.546 0.131 16.747
knnmod
## k-Nearest Neighbors
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9001, 9000, 9000, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9969000 0.9492275
## 2 0.9966000 0.9442136
## 3 0.9973000 0.9564079
## 4 0.9966998 0.9460288
## 5 0.9969998 0.9504717
## 6 0.9972999 0.9555952
## 7 0.9972999 0.9556936
## 8 0.9969999 0.9506282
## 9 0.9969999 0.9507180
## 10 0.9971999 0.9540502
## 11 0.9969999 0.9505251
## 12 0.9967000 0.9453658
## 13 0.9966999 0.9453629
## 14 0.9967999 0.9471504
## 15 0.9967999 0.9471504
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
#plot KNN model
plot(knnmod)

#set prediction, probability, and cv score variables in case needed
knnmod_pred <- predict(knnmod, test,'raw')
knnmod_prob <- predict(knnmod, test,'prob')
knnmod_scored <- cbind(test, knnmod_pred, knnmod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
knn_auc = knnmod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
## Warning: The `yardstick.event_first` option has been deprecated as of yardstick 0.0.7 and will be completely ignored in a future version.
## Instead, set the following argument directly in the metric function:
## `options(yardstick.event_first = TRUE)` -> `event_level = 'first'` (the default)
## `options(yardstick.event_first = FALSE)` -> `event_level = 'second'`
## This warning is displayed once per session.
knn_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.997
#ROC curve + plot
ROC_curve<-knnmod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('KNN ROC curve')
ggplotly(ROC_curve_plot)
#set threshold
knnmod_pred2 <- knnmod$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .66, 99.7% best.
mutate(prediction = ifelse(Yes>.66, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#confusion matrix
confusionMatrix(knnmod_pred2$prediction, knnmod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9666 12
## Yes 15 307
##
## Accuracy : 0.9973
## 95% CI : (0.9961, 0.9982)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9565
##
## Mcnemar's Test P-Value : 0.7003
##
## Sensitivity : 0.9624
## Specificity : 0.9985
## Pos Pred Value : 0.9534
## Neg Pred Value : 0.9988
## Prevalence : 0.0319
## Detection Rate : 0.0307
## Detection Prevalence : 0.0322
## Balanced Accuracy : 0.9804
##
## 'Positive' Class : Yes
##
LDA
#LDA model
system.time({
ldamod=train(BlueClass~Red+Green+Blue,data=train,trControl=train_control,method="lda",preProcess = c("center","scale"), family="binomial")
})
## user system elapsed
## 0.826 0.012 0.847
ldamod
## Linear Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 8999, 9000, 9000, 9000, 9001, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9844998 0.7611666
#set prediction, probability, and cv score variables in case needed
ldamod_pred <- predict(ldamod, test,'raw')
ldamod_prob <- predict(ldamod, test,'prob')
ldamod_scored <- cbind(test, ldamod_pred, ldamod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
lda_auc = ldamod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
lda_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.990
#ROC curve + plot
ROC_curve2<-ldamod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot2 <- ROC_curve2 %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot2)
#set new threshold
ldamod_pred2 <- ldamod$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .85, 98.6% best.
mutate(prediction = ifelse(Yes>.85, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#new threshold matrix
confusionMatrix(ldamod_pred2$prediction, ldamod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9609 79
## Yes 72 240
##
## Accuracy : 0.9849
## 95% CI : (0.9823, 0.9872)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7529
##
## Mcnemar's Test P-Value : 0.6254
##
## Sensitivity : 0.7524
## Specificity : 0.9926
## Pos Pred Value : 0.7692
## Neg Pred Value : 0.9918
## Prevalence : 0.0319
## Detection Rate : 0.0240
## Detection Prevalence : 0.0312
## Balanced Accuracy : 0.8725
##
## 'Positive' Class : Yes
##
QDA
#QDA model
system.time({
qdamod=train(BlueClass~Red+Green+Blue,data=train,trControl=train_control,method="qda",preProcess = c("center","scale"), family="binomial")
})
## user system elapsed
## 0.789 0.002 0.796
qdamod
## Quadratic Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 8999, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9951 0.9135447
##set prediction, probability, and cv score variables in case needed
qdamod_pred <- predict(qdamod, test,'raw')
qdamod_prob <- predict(qdamod, test,'prob')
qdamod_scored <- cbind(test, qdamod_pred, qdamod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
qda_auc = qdamod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
qda_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.998
#ROC curve + plot
ROC_curve3<-qdamod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot3 <- ROC_curve3 %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot3)
#set new threshold
qdamod_pred2 <- qdamod$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .40, 99.5% best.
mutate(prediction = ifelse(Yes>.40, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#new threshold matrix
confusionMatrix(qdamod_pred2$prediction, qdamod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9654 47
## Yes 27 272
##
## Accuracy : 0.9926
## 95% CI : (0.9907, 0.9942)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8764
##
## Mcnemar's Test P-Value : 0.0272
##
## Sensitivity : 0.8527
## Specificity : 0.9972
## Pos Pred Value : 0.9097
## Neg Pred Value : 0.9952
## Prevalence : 0.0319
## Detection Rate : 0.0272
## Detection Prevalence : 0.0299
## Balanced Accuracy : 0.9249
##
## 'Positive' Class : Yes
##
LR
#GLM model
system.time({
glmmod=train(BlueClass~Red+Green+Blue,data=train,trControl=train_control,method="glm",preProcess = c("center","scale"), family="binomial")
})
## user system elapsed
## 1.448 0.004 1.460
glmmod
## Generalized Linear Model
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9961994 0.9356128
##et prediction, probability, and cv score variables in case needed
glmmod_pred <- predict(glmmod, test,'raw')
glmmod_prob <- predict(glmmod, test,'prob')
glmmod_scored <- cbind(test, glmmod_pred, glmmod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
qda_auc = glmmod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
qda_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.998
#ROC curve + plot
ROC_curve4<-glmmod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot4 <- ROC_curve4 %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot4)
#set new threshold
glmmod_pred2 <- glmmod$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .29, 99.6% best.
mutate(prediction = ifelse(Yes>.29, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#new threshold matrix
confusionMatrix(glmmod_pred2$prediction, glmmod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9669 23
## Yes 12 296
##
## Accuracy : 0.9965
## 95% CI : (0.9951, 0.9976)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9424
##
## Mcnemar's Test P-Value : 0.09097
##
## Sensitivity : 0.9279
## Specificity : 0.9988
## Pos Pred Value : 0.9610
## Neg Pred Value : 0.9976
## Prevalence : 0.0319
## Detection Rate : 0.0296
## Detection Prevalence : 0.0308
## Balanced Accuracy : 0.9633
##
## 'Positive' Class : Yes
##
RF
#Random Forest model
#Choosing tuning parameters:
#https://discuss.analyticsvidhya.com/t/how-to-decide-no-of-ntrees-in-randomforest/6882/3
#https://rpubs.com/phamdinhkhanh/389752
#Create control function for training with 10 folds and keep 3 folds for training.
train_control <- caret::trainControl(method="cv", number=10, returnResamp='all', classProbs=TRUE, savePredictions='final')
#https://stackoverflow.com/questions/10085806/extracting-specific-columns-from-a-data-frame
df<- train %>%
select(Red,Green,Blue)
#mtryStart defaults at sqrt(p)
#my available threshold for mtry values is pretty low based on the size of my dataset
(tuneRF(df,train$BlueClass,mtry = 5, ntree = 500, stepFactor=5, improve=0.05,
trace=TRUE, plot=TRUE, doBest=TRUE))
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## invalid mtry: reset to within valid range
## mtry = 5 OOB error = 0.27%
## Searching left ...
## mtry = 1 OOB error = 0.27%
## 0 0.05
## Searching right ...
## mtry = 3 OOB error = 0.27%
## 0 0.05

##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 0.26%
## Confusion matrix:
## No Yes class.error
## No 9676 5 0.0005164756
## Yes 21 298 0.0658307210
#mtry = 3
##------
tunegrid <- expand.grid(.mtry = 3)
modellist <- list()
#train with different ntree parameters and inspect bias/variance tradeoff
#findtrees1 <- train(BlueClass~Red+Green+Blue,
# data=train,
# method = 'rf',
# metric = 'Accuracy',
# tuneGrid = tunegrid,
# trControl = control,
# ntree = 50)
#findtrees1
#findtrees2 <- train(BlueClass~Red+Green+Blue,
# data=train,
# method = 'rf',
# metric = 'Accuracy',
# tuneGrid = tunegrid,
# trControl = control,
# ntree = 100)
#findtrees2
system.time({
RF <- train(BlueClass~Red+Green+Blue,
data=train,
method = 'rf',
metric = 'Accuracy',
tuneGrid = tunegrid,
trControl = train_control,
ntree = 500)
})
## user system elapsed
## 12.824 1.271 14.356
RF
## Random Forest
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 8999, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9974998 0.9588549
##
## Tuning parameter 'mtry' was held constant at a value of 3
##et prediction, probability, and cv score variables in case needed
rfmod_pred <- predict(RF, test,'raw')
rfmod_prob <- predict(RF, test,'prob')
rfmod_scored <- cbind(test, rfmod_pred, rfmod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
rf_auc = rfmod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
rf_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.996
#ROC curve + plot
ROC_curve4<-rfmod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot4 <- ROC_curve4 %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot4)
#set new threshold
rfmod_pred2 <- RF$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .50, 99.7% best.
mutate(prediction = ifelse(Yes>.50, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#new threshold matrix
confusionMatrix(rfmod_pred2$prediction, rfmod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9672 16
## Yes 9 303
##
## Accuracy : 0.9975
## 95% CI : (0.9963, 0.9984)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9591
##
## Mcnemar's Test P-Value : 0.2301
##
## Sensitivity : 0.9498
## Specificity : 0.9991
## Pos Pred Value : 0.9712
## Neg Pred Value : 0.9983
## Prevalence : 0.0319
## Detection Rate : 0.0303
## Detection Prevalence : 0.0312
## Balanced Accuracy : 0.9745
##
## 'Positive' Class : Yes
##
SVM
#Choosing tuning parameters:
#Linear
set.seed(1)
tune.out.linear=tune(svm,BlueClass~Red+Green+Blue,data=train,kernel="linear",ranges=list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)))
#Radial
tune.out.radial=tune(svm, BlueClass~Red+Green+Blue,data=train, kernel="radial", ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
#lowest error is radial cost 10 gamma 1
#Poly
tune.out.poly=tune(svm,BlueClass~Red+Green+Blue,data=train, kernel="polynomial", ranges=list(cost=c(0.1,1,10,100,1000),degree=c(1,2,3,4,5)))
##------
system.time({
svmmod <- train(BlueClass~Red+Green+Blue,
data=train,
method = 'svmRadial',
metric = 'Accuracy',
trControl = train_control,
cost = 10,
gamma = 1,
preProcess = c("center","scale")
)
})
## user system elapsed
## 18.414 0.121 18.621
#Set prediction, probability, and cv score variables in case needed
svmmod_pred <- predict(svmmod, test,'raw')
svmmod_prob <- predict(svmmod, test,'prob')
svmmod_scored <- cbind(test, svmmod_pred, svmmod_prob)
#AUC/ROC
options(yardstick.event_first=FALSE)
#area under the curve
svmmod_auc = svmmod_prob %>%
yardstick::roc_auc(truth=test$BlueClass, Yes)
svmmod_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.999
#ROC curve + plot
ROC_curve4<-svmmod_prob %>%
yardstick::roc_curve(truth=test$BlueClass,estimate=Yes) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
ROC_curve_plot4 <- ROC_curve4 %>%
ggplot(aes(x=one_minus_specificity,y=sensitivity))+
geom_line() + geom_point() +
geom_abline(slope = 1,intercept = 0, linetype='dashed',color='blue')+
xlab("one_minus_specificity\n(false positive rate)")+
ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot4)
#set new threshold
svmmod_pred2 <- svmmod$pred %>%
#the accuracy doesn't improve by reducing the threshold any further than .52, 99.7% best.
mutate(prediction = ifelse(Yes>.52, 'Yes', 'No')) %>%
mutate(prediction = factor(prediction, levels=c('No','Yes')))
#new threshold matrix
confusionMatrix(svmmod_pred2$prediction, svmmod_pred2$obs, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9670 20
## Yes 11 299
##
## Accuracy : 0.9969
## 95% CI : (0.9956, 0.9979)
## No Information Rate : 0.9681
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9491
##
## Mcnemar's Test P-Value : 0.1508
##
## Sensitivity : 0.9373
## Specificity : 0.9989
## Pos Pred Value : 0.9645
## Neg Pred Value : 0.9979
## Prevalence : 0.0319
## Detection Rate : 0.0299
## Detection Prevalence : 0.0310
## Balanced Accuracy : 0.9681
##
## 'Positive' Class : Yes
##
Tuning parameter selection process
- Interpretation of your chosen tuning parameter values:
- Ntree = 500 indicates that 500 trees were created. I understand that it is important for this number to be substantial to reduce variance, but as we know, that can also lead to bias. There should be a sweet spot where every input row gets predicted at least a few times without overfitting the trees. Mtry=5 indicates that 5 variables were randomly sampled at each split. When mtry=p it can essentially equate to bagging, whereas if its set to 1 it essentially chooses a random variable. I understand it is good to try out a few values that range no smaller than 2 and no larger than p. The gamma=1 svm parameter indicates that a single training example has far reaching influence. A cost = 10 here means we are "paying" a high price for higher accuracy.
- Explanation of how your chosen tuning parameter values were chosen:
- I tried to make this selection process as programmatic as possible. There was a lot of learning and I'm sure there is still a healthy amount that should be corrected within the code but I was glad to find the resources that I did. For mtry I used the tuneRF() function that accepts an initial value for mtry and returns the out-of-bag error for your input value as well as a few surrounding values. I chose mtry because it produced the lowest oob error and went on to choose ntrees from there. At that point I included my mtry=3 into several RF models with ntrees of different values and again looked for the highest accuracy. Ntrees=500 was the winner in that sense, and it appears to be a very commonly used value for that parameter. For the SVM parameters I used our class lab as a guide to run the tune() function for linear, radial and polynomial kernels with several cost, gamma and degree values respectively. The tune function returns a best performer, which was the radial kernel with cost = 10 and gamma = 1. When running the train function the sigma = 8.691262 and C = 1 values were returned as contributing to the highest accuracy and therefore were the most optimal values for those parameters.
Conclusions
- A discussion of the best performing algorithm(s) in the cross-validation and hold-out data
- The best performing algorithm in the cross validation spectrum would be GLM or QDA. Both run in under 1.5 seconds to the user (up to ~15 seconds faster than some of the others) and still have an accuracy over 99.4%. The hold out data general ran very slowly, which could be due to the size of the dataset, but given the consistently high accuracy and easy of use of these two functions I would be even more likely to advocate for them in hold out setting.
- A recommendation and rationale regarding which algorithm to use for detection of blue tarps
- For the detection of blue tarps I’d recommend using the radial kernel svm function. Its valuable to tune your parameters according to your data and how you may define accuracy. Particularly with this pixel data, you could lower the threshold for determining a blue tarp knowing that it goes beyond the original tarp limits, but avoids the possibility of neglecting a human in need. The reason I became partial to these functions in the context of a natural disaster, is because it can give you a starting point for additional tuning. Accuracy and automation are two things I would assume are extremely valuable in the wake of a natural disaster.
- Additional thoughts and questions:
- I’ve done some research to learn more about how RGB average thresholds are set in order to determine that it is a “green” vegetation area or “red” soil area. I understand from a paper titled “Geospatial Disaster Response during the Haiti Earthquake: A Case Study Spanning Airborne Deployment, Data Collection, Transfer, Processing, and Dissemination” that a Mahalanobis distance classifier was trained using a collection of known blue tarp pixels, the RGB values of the pixels in the photos were averaged, and that’s how the other pixels are determined to be blue tarp or not to be blue tarp. My issue is largely that the average of those three values is not exclusive to the color. I read that “RGB (Red, Green, Blue) are 8 bit each. The range for each individual colour is 0-255 (as 2^8 = 256 possibilities). The combination range is 256256256. By dividing by 255, the 0-255 range can be described with a 0.0-1.0 range where 0.0 means 0 (0x00) and 1.0 means 255 (0xFF).” So how could the average of these three values indicate something is blue over it being red, when the average for something being ‘true’ red (255,0,0) would have the same average as something being ‘true’ green or blue at (0,255,0) or (0,0,255) respectively. It seems more logical for there to be a lower bound limit to what could be considered blue, in addition to it being at least 1.2x the values of the other two for example. There are studies around accessibility and exactly what shade behins to be unseeable by someone who is color blind, how do they determine their thresholds? That will be my next rabbit hole.